This package aims to perform metabolic pathway-based subytping of patient tumor samples from gene expression data.
The main steps are:
1. Perform GSVA on cancer patient gene expression data using KEGG metabolic pathways.
2. Perform k-means clustering on the GSVA matrix, identify metabolic subtypes. Optimal number of k is defined based on data or can be user-specified.
3. Summarize pathway activity per cluster: i.e. mean pathway activity per cluster, or do differential testing.
4. Run PROGENy on the gene expression data of tumors and compare signature activity scores between the identified clusters.
5. Perform Kaplan-Meier (KM) analysis with clusters if survival data are present.
The activity of metabolic pathways has a substantial effect on cancer progression and response to therapy. Therefore, the assessment of metabolic pathway expression and stratification of patients into clinically relevant subgroups have a high priority. The library MetabolicExpressR aims to provide an R library framework to carry out analyses within this concept. The main idea was developed in the paper of Danko et al., where metabolic subtyping of head and neck squamous cell carcinoma patients was performed with two metabolic subtypes uncovered showing differing response to adjuvant radio(chemo)therapy.
This vignette guides you through the main steps of metabolic subtyping of tumor samples using gene expression data.
# Set wd to current location:
current_path = rstudioapi::getActiveDocumentContext()$path
setwd(dirname(current_path))
########## Load libraries
library("tidyverse")
library("fgsea")
# devtools::install_github('dBenedek/MetabolicExpressR')
library("MetabolicExpressR")
########## Load data
# KEGG metabolic pathways gene set:
kegg_gs <- gmtPathways("../data/kegg_metabolic_human_20211026.gmt")
# CPTAC-HNSCC
gene_exp_cptac_hnscc <- read.table("../data/CPTAC_HNSCC/HS_CPTAC_HNSCC_RNAseq_RSEM_UQ_log2_Normal.cct",
sep = "\t", stringsAsFactors = F, header = T) %>%
column_to_rownames("Idx")
clinical_cptac_hnscc <- read.table("../data/CPTAC_HNSCC/HS_CPTAC_HNSCC_CLI.tsi",
sep = "\t", header = T, stringsAsFactors = F)
clinical_cptac_hnscc <- clinical_cptac_hnscc[-1, ] %>%
mutate(case_id = str_replace_all(case_id, "-", "\\."), overall_survival = as.numeric(overall_survival),
overall_free_status = as.numeric(overall_free_status), progression_free_status = as.numeric(progression_free_status),
progression_free_survival = as.numeric(progression_free_survival)) %>%
filter(P16 != "Positive (>70% nuclear and cytoplasmic staining)" & case_id %in%
colnames(gene_exp_cptac_hnscc))
# TCGA-ACC
gene_exp_tcga_acc <- read.table("../data/TCGA_ACC/ACC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_acc <- gene_exp_tcga_acc[-1, ] %>%
mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
column_to_rownames("Hybridization.REF") %>%
mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_acc) <- colnames(gene_exp_tcga_acc) %>%
str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_acc <- read.table("../data/TCGA_ACC/HS_CPTAC_ACC_CLI.tsi", sep = "\t",
header = T, stringsAsFactors = F) %>%
column_to_rownames("attrib_name") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("case_id")
clinical_cptac_acc <- clinical_cptac_acc %>%
mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))
# TCGA-UVM
gene_exp_tcga_uvm <- read.table("../data/TCGA_UVM/UVM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_uvm <- gene_exp_tcga_uvm[-1, ] %>%
mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
column_to_rownames("Hybridization.REF") %>%
mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_uvm) <- colnames(gene_exp_tcga_uvm) %>%
str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_uvm <- read.table("../data/TCGA_UVM/HS_CPTAC_UVM_CLI.tsi", sep = "\t",
header = T, stringsAsFactors = F) %>%
column_to_rownames("attrib_name") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("case_id")
clinical_cptac_uvm <- clinical_cptac_uvm %>%
mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))
# TCGA-LAML
gene_exp_tcga_laml <- read.table("../data/TCGA_LAML/LAML.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_laml <- gene_exp_tcga_laml[-1, ] %>%
mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
column_to_rownames("Hybridization.REF") %>%
mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_laml) <- colnames(gene_exp_tcga_laml) %>%
str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_laml <- read.table("../data/TCGA_LAML/HS_CPTAC_LAML_CLI.tsi", sep = "\t",
header = T, stringsAsFactors = F) %>%
column_to_rownames("attrib_name") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("case_id")
clinical_cptac_laml <- clinical_cptac_laml %>%
mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))
# CPTAC-COAD data:
gene_exp_cptac_coad <- read.table("../data/CPTAC_COAD/Human__CPTAC_COAD__UNC__RNAseq__HiSeq_RNA__03_01_2017__BCM__Gene__BCM_RSEM_UpperQuartile_log2.cct",
sep = "\t", stringsAsFactors = F, header = T) %>%
column_to_rownames("attrib_name")
# CPTAC-COAD: There is no survival days in survival data
# TCGA-COAD data:
gene_exp_tcga_coad <- read.table(gzfile("../data/TCGA_COAD/Human__TCGA_COADREAD__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct.gz"),
sep = "\t", stringsAsFactors = F, header = T) %>%
column_to_rownames("attrib_name")
clinical_tcga_coad <- read.table("../data/TCGA_COAD/HS_CPTAC_TCGA_COAD_CLI.tsi",
sep = "\t", header = T, stringsAsFactors = F) %>%
column_to_rownames("attrib_name") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("case_id")
clinical_tcga_coad <- clinical_tcga_coad %>%
mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))
GSVA: gene set variation analysis for microarray and RNA-Seq data
This might take some time, especially for larger (n > 100) data sets.
# CPTAC-HNSC data:
gsva_metab_cptac_hnscc <- run_gsva_metabolic(gene_exp_cptac_hnscc, kcdf = "Gaussian",
kegg_gs = kegg_gs, n_cores = 1L)
# TCGA-ACC data:
gsva_metab_tcga_acc <- run_gsva_metabolic(gene_exp_tcga_acc, kcdf = "Gaussian", kegg_gs = kegg_gs,
n_cores = 1L)
# TCGA-UVM data:
gsva_metab_tcga_uvm <- run_gsva_metabolic(gene_exp_tcga_uvm, kcdf = "Gaussian", kegg_gs = kegg_gs,
n_cores = 1L)
# TCGA-LAML data:
gsva_metab_tcga_laml <- run_gsva_metabolic(gene_exp_tcga_laml, kcdf = "Gaussian",
kegg_gs = kegg_gs, n_cores = 1L)
# TCGA-COAD data:
gsva_metab_tcga_coad <- run_gsva_metabolic(gene_exp_tcga_coad, kcdf = "Gaussian",
kegg_gs = kegg_gs, n_cores = 1L)
# CPTAC-COAD data:
gsva_metab_cptac_coad <- run_gsva_metabolic(gene_exp_cptac_coad, kcdf = "Gaussian",
kegg_gs = kegg_gs, n_cores = 1L)
It is recommended to use the optimal number of k for the clustering step. This is selected based on several indices included in the NbClust R library. Otherwise, you can set the parameters user_def_k = TRUE and k = ... in the kmeans_gsva_metabolic() function.
# CPTAC-HNSC data:
kmeans_clusters_cptac_hnscc <- kmeans_gsva_metabolic(gsva_metab_cptac_hnscc, kegg_gs = kegg_gs)
# TCGA-ACC data:
kmeans_clusters_tcga_acc <- kmeans_gsva_metabolic(gsva_metab_tcga_acc, kegg_gs = kegg_gs)
# TCGA-UVM data:
kmeans_clusters_tcga_uvm <- kmeans_gsva_metabolic(gsva_metab_tcga_uvm, kegg_gs = kegg_gs)
# TCGA-LAML data:
kmeans_clusters_tcga_laml <- kmeans_gsva_metabolic(gsva_metab_tcga_laml, kegg_gs = kegg_gs)
# TCGA-COAD data:
kmeans_clusters_tcga_coad <- kmeans_gsva_metabolic(gsva_metab_tcga_coad, kegg_gs = kegg_gs,
user_def_k = TRUE, k = 2)
# CPTAC-COAD data:
kmeans_clusters_cptac_coad <- kmeans_gsva_metabolic(gsva_metab_cptac_coad, kegg_gs = kegg_gs)
kmeans_clusters_cptac_hnscc$heatmap
kmeans_clusters_tcga_acc$heatmap
kmeans_clusters_tcga_uvm$heatmap
kmeans_clusters_tcga_laml$heatmap
kmeans_clusters_tcga_coad$heatmap
kmeans_clusters_cptac_coad$heatmap
plot_mean_pathway_activity(gsva_data = gsva_metab_cptac_hnscc, kegg_gs = kegg_gs,
kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res)
plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_acc, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_acc$kmean_res)
plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_uvm, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_uvm$kmean_res)
plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_laml, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_laml$kmean_res)
plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_coad, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_coad$kmean_res)
plot_mean_pathway_activity(gsva_data = gsva_metab_cptac_coad, kegg_gs = kegg_gs,
kmeans_res = kmeans_clusters_cptac_coad$kmean_res)
From the website of PROGENy: “PROGENy is resource that leverages a large compendium of publicly available signaling perturbation experiments to yield a common core of pathway responsive genes for human and mouse. These, coupled with any statistical method, can be used to infer pathway activities from bulk or single-cell transcriptomics.”
# CPTAC-HNSC data:
progeny_cptac_hnsc <- run_progeny(gene_exp_data = gene_exp_cptac_hnscc, kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res)
progeny_cptac_hnsc$progeny_boxplot
# TCGA-ACC data:
progeny_tcga_acc <- run_progeny(gene_exp_data = gene_exp_tcga_acc, kmeans_res = kmeans_clusters_tcga_acc$kmean_res)
progeny_tcga_acc$progeny_boxplot
# TCGA-UVM data:
progeny_tcga_uvm <- run_progeny(gene_exp_data = gene_exp_tcga_uvm, kmeans_res = kmeans_clusters_tcga_uvm$kmean_res)
progeny_tcga_uvm$progeny_boxplot
# TCGA-LAML data:
progeny_tcga_lam <- run_progeny(gene_exp_data = gene_exp_tcga_laml, kmeans_res = kmeans_clusters_tcga_laml$kmean_res)
progeny_tcga_lam$progeny_boxplot
# TCGA-COAD data:
progeny_tcga_coad <- run_progeny(gene_exp_data = gene_exp_tcga_coad, kmeans_res = kmeans_clusters_tcga_coad$kmean_res)
progeny_tcga_coad$progeny_boxplot
# CPTAC-COAD data:
progeny_cptac_coad <- run_progeny(gene_exp_data = gene_exp_cptac_coad, kmeans_res = kmeans_clusters_cptac_coad$kmean_res)
progeny_cptac_coad$progeny_boxplot
# CPTAC-HNSC data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res, clinical_data = clinical_cptac_hnscc,
sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")
# TCGA-ACC data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_acc$kmean_res, clinical_data = clinical_cptac_acc,
sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")
# TCGA-UVM data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_uvm$kmean_res, clinical_data = clinical_cptac_uvm,
sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")
# TCGA-LAML data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_laml$kmean_res, clinical_data = clinical_cptac_laml,
sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")
# TCGA-COAD data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_coad$kmean_res, clinical_data = clinical_tcga_coad,
sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Berlin
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] MetabolicExpressR_0.0.1.0 fgsea_1.30.0
## [3] lubridate_1.9.3 forcats_1.0.0
## [5] stringr_1.5.1 dplyr_1.1.4
## [7] purrr_1.0.2 readr_2.1.5
## [9] tidyr_1.3.1 tibble_3.2.1
## [11] ggplot2_3.5.1 tidyverse_2.0.0
## [13] BiocStyle_2.32.1
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.16.0
## [3] jsonlite_1.8.8 shape_1.4.6.1
## [5] magrittr_2.0.3 magick_2.8.3
## [7] farver_2.1.1 rmarkdown_2.26
## [9] GlobalOptions_0.1.2 zlibbioc_1.50.0
## [11] vctrs_0.6.5 Cairo_1.6-2
## [13] memoise_2.0.1 tinytex_0.50
## [15] rstatix_0.7.2 htmltools_0.5.8.1
## [17] S4Arrays_1.4.0 broom_1.0.5
## [19] Rhdf5lib_1.26.0 SparseArray_1.4.0
## [21] rhdf5_2.48.0 sass_0.4.9
## [23] bslib_0.7.0 plyr_1.8.9
## [25] zoo_1.8-12 cachem_1.0.8
## [27] commonmark_1.9.1 lifecycle_1.0.4
## [29] iterators_1.0.14 pkgconfig_2.0.3
## [31] rsvd_1.0.5 Matrix_1.6-5
## [33] R6_2.5.1 fastmap_1.1.1
## [35] GenomeInfoDbData_1.2.12 MatrixGenerics_1.16.0
## [37] clue_0.3-65 digest_0.6.35
## [39] colorspace_2.1-0 AnnotationDbi_1.65.2
## [41] S4Vectors_0.42.0 DESeq2_1.44.0
## [43] irlba_2.3.5.1 GenomicRanges_1.55.4
## [45] RSQLite_2.3.6 ggpubr_0.6.0
## [47] beachmat_2.20.0 labeling_0.4.3
## [49] km.ci_0.5-6 fansi_1.0.6
## [51] timechange_0.3.0 httr_1.4.7
## [53] abind_1.4-5 compiler_4.4.1
## [55] bit64_4.0.5 withr_3.0.0
## [57] doParallel_1.0.17 backports_1.4.1
## [59] BiocParallel_1.38.0 viridis_0.6.5
## [61] carData_3.0-5 DBI_1.2.2
## [63] highr_0.10 HDF5Array_1.32.0
## [65] ggsignif_0.6.4 DelayedArray_0.30.0
## [67] rjson_0.2.21 tools_4.4.1
## [69] progeny_1.26.0 glue_1.7.0
## [71] rhdf5filters_1.16.0 gridtext_0.1.5
## [73] grid_4.4.1 reshape2_1.4.4
## [75] cluster_2.1.6 generics_0.1.3
## [77] gtable_0.3.5 KMsurv_0.1-5
## [79] tzdb_0.4.0 survminer_0.4.9
## [81] data.table_1.15.4 hms_1.1.3
## [83] xml2_1.3.6 car_3.1-2
## [85] BiocSingular_1.20.0 ScaledMatrix_1.12.0
## [87] utf8_1.2.4 XVector_0.44.0
## [89] BiocGenerics_0.50.0 markdown_1.12
## [91] ggrepel_0.9.5 foreach_1.5.2
## [93] pillar_1.9.0 GSVA_1.52.0
## [95] splines_4.4.1 circlize_0.4.16
## [97] ggtext_0.1.2 lattice_0.22-5
## [99] survival_3.7-0 bit_4.0.5
## [101] annotate_1.82.0 tidyselect_1.2.1
## [103] ComplexHeatmap_2.20.0 SingleCellExperiment_1.26.0
## [105] locfit_1.5-9.9 Biostrings_2.72.0
## [107] knitr_1.46 gridExtra_2.3
## [109] bookdown_0.41 IRanges_2.38.0
## [111] SummarizedExperiment_1.33.3 stats4_4.4.1
## [113] xfun_0.49 Biobase_2.64.0
## [115] matrixStats_1.3.0 stringi_1.8.3
## [117] UCSC.utils_1.0.0 yaml_2.3.8
## [119] evaluate_0.23 codetools_0.2-19
## [121] NbClust_3.0.1 BiocManager_1.30.22
## [123] graph_1.82.0 cli_3.6.2
## [125] xtable_1.8-4 munsell_0.5.1
## [127] jquerylib_0.1.4 survMisc_0.5.6
## [129] Rcpp_1.0.12 GenomeInfoDb_1.40.0
## [131] png_0.1-8 XML_3.99-0.16.1
## [133] parallel_4.4.1 blob_1.2.4
## [135] sparseMatrixStats_1.16.0 SpatialExperiment_1.13.2
## [137] viridisLite_0.4.2 GSEABase_1.66.0
## [139] scales_1.3.0 crayon_1.5.2
## [141] GetoptLong_1.0.5 rlang_1.1.3
## [143] cowplot_1.1.3 fastmatch_1.1-4
## [145] KEGGREST_1.44.0 formatR_1.14